Based on the results of canberra_property_linear_regression.ipynb; all of the analysis below uses new_output.csv.
In [1]:
import pandas as pd
import re
import math
In [2]:
# Load new_output.csv, the dataset this analysis is based on.
df = pd.read_csv('new_output.csv')
In [3]:
# First run of four consecutive digits is taken to be the year.
pattern = re.compile(r'\d{4}')


def add_year_to_pd(row):
    """Return the first four-digit number found in ``row['sold_date']``.

    Args:
        row: A mapping-like record (for example a DataFrame row) that has a
            ``'sold_date'`` entry.

    Returns:
        The first run of four digits in the date text as an ``int``, or
        2017 when the value is not text (for example a missing/NaN cell)
        or contains no four-digit run.
    """
    date = row['sold_date']
    try:
        match = pattern.search(date)
    except TypeError:
        # Non-string values (e.g. NaN from an empty CSV cell) cannot be
        # searched; treat them the same as "no year found".
        return 2017
    return int(match.group()) if match else 2017
In [4]:
# Derive the 'year' column by applying add_year_to_pd to every row.
df['year'] = df.apply(add_year_to_pd, axis=1)
In [5]:
def adjust_price_3_percent_increase(row, annual_rate=0.03, base_year=2017):
    """Scale a sale price to ``base_year`` using compound annual growth.

    Args:
        row: A mapping-like record (for example a DataFrame row) with
            ``'price_int'`` (the sale price) and ``'year'`` (the sale year).
        annual_rate: Assumed yearly growth as a fraction. Defaults to 0.03,
            the 3 percent this function was originally hard-coded to.
        base_year: Year the price is adjusted to. Defaults to 2017.

    Returns:
        ``price_int * (1 + annual_rate) ** (base_year - year)`` as a float.
        Sales dated after ``base_year`` are discounted rather than grown
        because the exponent becomes negative.
    """
    years_elapsed = base_year - row['year']
    growth_factor = math.pow(1 + annual_rate, years_elapsed)
    return row['price_int'] * growth_factor
In [6]:
# Add the price adjusted to the common base year for every row.
df['adjust_price'] = df.apply(adjust_price_3_percent_increase, axis=1)
In [7]:
# Preview the first rows to confirm the 'year' and 'adjust_price' columns exist.
df.head()
Out[7]:
In [7]:
# Keep only the listings whose suburb is one of those chosen for the
# 'north' subset; isin() replaces the chain of == comparisons joined by |.
north = df[df['suburb'].isin([
    'Gungahlin',
    'Ngunnawal',
    'Harrison',
    'Bonner',
    'Franklin',
    'Casey',
])]
In [8]:
# Drop listings whose adjusted price is 1,500,000 or more.
north = north.loc[north['adjust_price'] < 1500000]
In [9]:
# Count the non-null values in each column of the filtered subset.
north.count()
Out[9]:
In [10]:
# Preview the first rows of the filtered subset.
north.head()
Out[10]:
In [12]:
import statsmodels.api as sm
In [13]:
# Single explanatory variable for the regression: the 'bed' column.
X = north['bed']
In [14]:
# Add a constant column so the fitted model includes an intercept term.
X = sm.add_constant(X)
In [15]:
# Response variable: the adjusted sale price.
y = north['adjust_price']
In [16]:
# Ordinary least squares model of y on X (response first, then design matrix).
linear_regression = sm.OLS(y, X)
In [17]:
# Estimate the model; the results object is reused for reporting and plotting.
fitted_model = linear_regression.fit()
In [18]:
# Print the full regression report for the fitted model.
print(fitted_model.summary())
In [19]:
# Print only the estimated coefficients of the fitted model.
print(fitted_model.params)
In [20]:
import matplotlib.pyplot as plt
import matplotlib as mpl
In [21]:
%matplotlib inline
In [22]:
# Horizontal axis limits: smallest and largest observed 'bed' value.
x_range = [
    north['bed'].min(),
    north['bed'].max(),
]
In [23]:
# Vertical axis limits: smallest and largest observed adjusted price.
y_range = [
    north['adjust_price'].min(),
    north['adjust_price'].max(),
]
In [24]:
# Predict on the same design matrix that was used for fitting, then draw the
# observations as a scatter and overlay the fitted line on the same axes.
fitted_values = fitted_model.predict(X)
scatter_plot = north.plot.scatter(
    x='bed',
    y='adjust_price',
    xlim=x_range,
    ylim=y_range,
)
regression_line = scatter_plot.plot(
    north['bed'],
    fitted_values,
    '-',
    color='orange',
    linewidth=1,
)